home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Games of Daze
/
Infomagic - Games of Daze (Summer 1995) (Disc 1 of 2).iso
/
x2ftp
/
msdos
/
ai
/
fuzzy
/
token.b
< prev
next >
Wrap
Text File
|
1986-11-29
|
31KB
|
800 lines
-------------------------------------------------------------------------------
-- --
-- Library Unit: Token -- Get token package --
-- --
-- Author: Bradley L. Richards --
-- --
-- Version Date Notes . . . --
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --
-- 1.0 6 Feb 86 Initial Version --
-- 1.1 25 Feb 86 All basic routines completed --
-- 1.2 13 Mar 86 Added reserved words. Split Ada and Fuzzy --
-- Prolog via conditional compilation --
-- 1.3 22 May 86 Revised lots of Fuzzy Prolog stuff to make it --
-- work; adding reserved words, etc.. --
-- 1.4 19 Jun 86 Use revised io package and data_def --
-- 2.0 20 Jun 86 Token_type extracted into package Data_def --
-- 2.05 13 Jul 86 Split into separate spec and body files --
-- 2.1 21 Jul 86 Demonstration Version --
-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --
-- --
-- Library units used: io -- read source file and produce listing --
-- listing -- insert messages in listing --
-- data_def -- common data definitions --
-- --
-- Description: This package reads the source file (via "io") and parses --
-- out individual tokens. There are three defined types which calling --
-- routines must use. Token_type defines the legal kinds of tokens --
-- which may be returned. Token_record is a variant record (with a --
-- discriminant of token_type) which is used to hold tokens. Finally, --
-- token_ptr is the access type to the dynamically allocated --
-- token_records. --
-- This package must be initialized by calling start_token. Then --
-- subsequent calls to get_token will return one token each. Note --
-- that end-of-file is indicated by the special token "end_of_file." --
-- --
-- Note that this package is designed to be used with multiple --
-- languages. In order to share the code and ensure that generic --
-- changes are made to all versions, this package is kept in a master --
-- source file which contains conditional compilation directives --
-- for use by the Ada preprocessor "pp." --
-- --
-------------------------------------------------------------------------------
-- --
-- Package Body --
-- --
-------------------------------------------------------------------------------
package body token is
seen_end_of_file : boolean;
procedure get_token is separate;
--------------------------------------------
-- Initialization and Utility functions --
-- in alphabetical order --
--------------------------------------------
--
-- start_token -- This routine makes the first call to get_char, so that
-- get_token will have something to look at.
--
procedure start_token(source_file, listing_file : in string) is
begin
seen_end_of_file := false;
--
-- for Fuzzy Prolog use a lookahead of 1. A lookahead of 2 is too
-- awkward to interactive i/o
--
start_io(source_file, listing_file, 1);
start_listing;
get_char; -- get first character
end start_token;
--
-- stop_token -- This routine cleans up whatever needs to be done at the
-- end of parsing a file
--
procedure stop_token is
begin
stop_io;
end stop_token;
--
-- skip_rest_of_token -- When a token_fetching routine encounters an error
-- it generally returns the last valid value it had,
-- and then wants to skip over the rest of the
-- erroneous token. This routine implements this by
-- skipping characters until it encounters one which
-- should not be embedded in a token.
-- Note that it will stop on many characters which
-- are not legal delimiters in the language. This
-- allows these characters to be flagged as seperate
-- errors.
--
procedure skip_rest_of_token is
begin
while not valid_ending(look_ahead_char) loop
get_char;
end loop;
end skip_rest_of_token;
--
-- valid_ending -- This routine looks for characters which signal the end
-- of a token. These characters may or may not technically
-- be delimiters.
--
-- Characters accepted by this routine are:
--
-- space ! $ % & ( ) * + , - / : ; < = > ? @ [ \ ] ^ { | } ~ eot cr tab
--
-- Characters not accepted: control characters (except eot, tab, & cr)
-- digits, letters
-- " # ' . _ ` ascii.rub
--
function valid_ending( char : character ) return boolean is
begin
if (char in ' '..'!') or
(char in '$'..'&') or
(char in '('..'-') or
(char = '/') or
(char in ':'..'@') or
(char in '['..'^') or
(char in '{'..'~') or
(char = ascii.ht) or
(char = ascii.eot) or
(char = ascii.cr)
then return true;
else return false;
end if;
end valid_ending;
end token;
--
-- get_token -- This routine parses and returns the next token in the source
-- file. It expects look_ahead_char (the package variable) to be set
-- to the next character to be processed. The current character,
-- therefore, has already been parsed. All subroutines which
-- get_token calls to handle the various token types must follow
-- this convention. The subroutine which finally identifies the
-- token is allocates and defines the token_record which is to
-- be returned. The only time get_token itself defines the
-- token_record is for the end_of_file.
--
separate(token)
procedure get_token is
token : token_ptr;
have_token : boolean := false;
--
-- All token handling routines are separate. This makes the code
-- easier to read than if several hundred lines were embedded here.
--
procedure get_character( token : out token_ptr) is separate;
procedure get_comment_or_minus( token : out token_ptr) is separate;
procedure get_greater_than( token : out token_ptr) is separate;
procedure get_identifier( token : out token_ptr) is separate;
procedure get_number( token : out token_ptr) is separate;
procedure get_string( token : out token_ptr) is separate;
procedure get_fuzzy_backslash( token : out token_ptr) is separate;
procedure get_fuzzy_colon( token : out token_ptr) is separate;
procedure get_fuzzy_equal( token : out token_ptr) is separate;
procedure get_fuzzy_underline( token : out token_ptr) is separate;
begin -- get_token
loop
case look_ahead_char is
--
-- skip embedded spaces
--
when ' ' => loop
get_char;
exit when look_ahead_char /= ' ';
end loop;
--
-- handle multi-character tokens
--
when 'A'..'Z' => get_identifier(token); have_token := true;
when 'a'..'z' => get_identifier(token); have_token := true;
when '0'..'9' => get_number(token); have_token := true;
when '-' | '{' => get_comment_or_minus(token);
if token.is_a /= null_token then
have_token := true;
end if;
when '"' => get_string(token); have_token := true;
when ''' => get_character(token); have_token := true;
when '>' => get_greater_than(token); have_token := true;
--
-- special cases: tab, end-of-line, and end-of-file
--
when ascii.ht => get_char;
when ascii.cr => get_char;
when ascii.eot => token := new token_record'(is_a => end_of_file);
have_token := true;
if seen_end_of_file = true then
raise unexpected_end_of_file;
else seen_end_of_file := true;
end if;
when '=' => get_fuzzy_equal(token);
if token.is_a /= null_token then
have_token := true;
end if;
when ':' => get_fuzzy_colon(token);
if token.is_a /= null_token then
have_token := true;
end if;
--
-- now a sub-case statement to handle single character tokens
--
when others =>
get_char;
have_token := true; -- almost certainly true
case current_char is
when '(' => token := new token_record'(is_a => left_paren);
when ')' => token := new token_record'(is_a => right_paren);
when '*' => token := new token_record'(is_a => asterisk);
when '+' => token := new token_record'(is_a => plus);
when ',' => token := new token_record'(is_a => comma);
when ';' => token := new token_record'(is_a => semicolon);
when '|' => token := new token_record'(is_a => bar);
when '!' => token := new token_record'(is_a => cut);
when '/' => token := new token_record'(is_a => slash);
when '<' => token := new token_record'(is_a => less_than);
when '[' => token := new token_record'(is_a => left_bracket);
when ']' => token := new token_record'(is_a => right_bracket);
when '^' => token := new token_record'(is_a => hat);
when '.' => token := new token_record'(is_a => period);
when '\' => get_fuzzy_backslash(token);
if token.is_a /= null_token then
have_token := true;
end if;
when '_' => get_fuzzy_underline(token);
if token.is_a /= null_token then
have_token := true;
end if;
when others => error(pointer,"illegal character");
skip_rest_of_token;
have_token := false;
end case;
end case;
exit when have_token;
end loop;
current_token := token;
end get_token;
-------------------------------------------------------------------------------
-- --
-- Token Fetching Routines --
-- --
-------------------------------------------------------------------------------
--
-- get_character -- This subroutine expects to see a single character enclosed
-- in single quotes. Since this syntax is strictly defined,
-- there is no confusion when the character is a single
--
-- Syntax: character_spec ::= ' ascii.character '
--
-- Examples: 'a' 'M' '#' ''' 'z'
--
separate(token.get_token)
procedure get_character( token : out token_ptr) is
begin
get_char;
if (look_ahead_char = ascii.eot) or (look_ahead_char = ascii.cr) then
error(pointer,"invalid character literal");
else
get_char;
token := new token_record'(character_lit, current_char);
if look_ahead_char /= ''' then
error(pointer,"invalid character literal");
skip_rest_of_token;
else
get_char;
end if;
end if;
end get_character;
--
-- get_comment_or_minus -- This routine handles two forms of comments. The
-- first is initiated by two adjacent dashes and
-- terminated by the end-of-line. If only a single
-- dash is found, a "minus" token is returned. The
-- second form of comment is enclosed within scroll
-- brackets, and may cover multiple lines. Nesting
-- level of the brackets is tracked, so comments may
-- be nested. Comments return a "null_token."
--
-- Syntax: minus ::= '-'
-- comment ::= '--' comment 'ascii.cr' | '{' comment '}'
--
separate(token.get_token)
procedure get_comment_or_minus( token : out token_ptr) is
nesting_level : natural := 1;
begin
get_char;
if current_char = '-' then
if look_ahead_char /= '-' then
token := new token_record'(is_a => minus);
else
loop
get_char;
exit when (current_char = ascii.cr) or (current_char = ascii.eot);
end loop;
token := new token_record'(is_a => null_token);
end if;
else -- current_char = '{'
loop
get_char;
if current_char = '}' then
nesting_level := nesting_level - 1;
elsif current_char = '{' then
nesting_level := nesting_level + 1;
elsif current_char = ascii.eot then
error(pointer, "unterminated comment block");
nesting_level := 0;
end if;
exit when (nesting_level = 0);
end loop;
token := new token_record'(is_a => null_token);
end if;
end get_comment_or_minus;
--
-- get_greater_than -- The two tokens beginning with '>' are the
-- "greater_than" and the "greater_or_equal"
--
separate(token.get_token)
procedure get_greater_than( token : out token_ptr) is
begin
get_char;
if look_ahead_char = '=' then
get_char;
token := new token_record'(is_a => greater_or_equal);
else
token := new token_record'(is_a => greater_than);
end if;
end get_greater_than;
--
-- get_identifier -- Identifiers must begin with a letter (either upper or
-- lower case), and may then contain both letters and
-- digits. Underlines may be embedded, but must separate
-- letters and digits. Case is significant, only for
-- Fuzzy Prolog, and only in that the first character, if
-- capitalized, indicates that the identifier is a variable.
-- Underlines are significant in all identifiers. The only
-- limit on identifier length is line length, which is
-- controlled by package "special.io"
--
-- Syntax: identifier ::= letter { [ '_' ] letter_or_digit }
--
separate(token.get_token)
procedure get_identifier( token : out token_ptr) is
ptr : integer range 0..io.max_line_length := 0;
ident_name : string(1..io.max_line_length) := (others => ' ');
ident : name_ptr;
err_flg : boolean := false;
reserved : boolean := false;
convert : constant integer := character'pos('a') - character'pos('A');
var_flg : boolean := false;
--
-- This routine checks the identifier against the list of reserved
-- words. If it is reserved, then "token" is set appropriately and
-- reserved is true. The search method used is a simplistic hash table.
--
procedure check_reserved(length : in integer; ident : in string;
reserved : out boolean; token : out token_ptr) is
type word_record is
record
word : string(1..9);
rw_token : token_type;
end record;
char_pos : constant array ('A'..'Z',1..2) of integer
:= ( (1,4), (1,0), (5,7),
(8,9), (1,0), (10,13), (14,15), (1,0), (16,17),
(1,0), (1,0), (18,20), (21,21), (22,29), (30,31),
(32,33), (1,0), (34,37), (38,42), (43,49), (50,50),
(51,51), (52,52), (1,0), (1,0), (1,0) );
words : constant array(1..52) of word_record :=
( ("ASSERTA ", rw_asserta), ("ASSERTZ ", rw_assertz),
("ATOM ", rw_atom), ("ATOMIC ", rw_atomic),
("CALL ", rw_call), ("CLAUSE ", rw_clause),
("CONSULT ", rw_consult), ("DEBUGGING", rw_debugging),
("DISPLAY ", rw_display), ("FAIL ", rw_fail),
("FLOAT ", rw_float), ("FUNCTOR ", rw_functor),
("FUZZY ", rw_fuzzy), ("GET ", rw_get),
("GET0 ", rw_get0), ("INTEGER ", rw_integer),
("IS ", rw_is), ("LISTING ", rw_listing),
("LN ", rw_ln), ("LOG ", rw_log),
("MOD ", rw_mod), ("NAME ", rw_name),
("NL ", rw_nl), ("NODEBUG ", rw_nodebug),
("NONVAR ", rw_nonvar), ("NOSPY ", rw_nospy),
("NOT ", rw_not), ("NOTRACE ", rw_notrace),
("NUMBER ", rw_number), ("OP ", rw_op),
("ORG ", rw_org), ("PARSE ", rw_parse),
("PUT ", rw_put), ("READ ", rw_read),
("REPEAT ", rw_repeat), ("RESET ", rw_reset),
("RETRACT ", rw_retract),
("SEE ", rw_see), ("SEEING ", rw_seeing),
("SEEN ", rw_seen), ("SKIP ", rw_skip),
("SPY ", rw_spy), ("TAB ", rw_tab),
("TELL ", rw_tell), ("TELLING ", rw_telling),
("THRESHOLD", rw_threshold),
("TOLD ", rw_told), ("TRACE ", rw_trace),
("TRUE ", rw_true), ("USER ", rw_user),
("VAR ", rw_var), ("WRITE ", rw_write) );
fail, found : boolean := false;
which : integer;
begin
for i in char_pos(ident(1),1) .. char_pos(ident(1),2) loop
for j in 2..length loop
if ident(j) < words(i).word(j) then
fail := true;
elsif ident(j) > words(i).word(j) then
exit;
else
if j = length then
if j = 9 then
found := true; which := i;
elsif words(i).word(j+1) = ' ' then
found := true; which := i;
else
exit;
end if;
end if;
end if;
exit when fail or found;
end loop;
exit when fail or found;
end loop;
if found then
token := new token_record'(reserved_word, words(which).rw_token);
end if;
reserved := found;
end check_reserved;
begin
if look_ahead_char in 'A'..'Z' then -- it's a Fuzzy Prolog variable
var_flg := true;
end if;
loop
get_char;
ptr := ptr + 1;
if (current_char = '_') or else
(current_char in 'A'..'Z') or else
(current_char in 'a'..'z') or else
(current_char in '0'..'9') then
ident_name(ptr) := current_char;
else
error(pointer,"invalid character in identifier");
skip_rest_of_token;
err_flg := true;
exit;
end if;
if current_char = '_' and (not ((look_ahead_char in 'A'..'Z') or else
(look_ahead_char in 'a'..'z') or else
(look_ahead_char in '0'..'9'))) then
error(pointer,"underlines must separate letters or digits");
skip_rest_of_token;
err_flg := true;
end if;
if current_char in 'a'..'z' then
ident_name(ptr) := character'val(character'pos(current_char) - convert);
end if;
exit when valid_ending(look_ahead_char) or err_flg;
exit when look_ahead_char = '.'; -- required to detect end of clause
end loop;
if ptr <= 9 then
check_reserved(ptr, ident_name(1..ptr), reserved, token);
end if;
if reserved then
if var_flg then
error(pointer,"reserved words may not begin with capital letters");
end if;
elsif var_flg then
ident := new name_record'(ptr, ident_name(1..ptr));
token := new token_record'(variable, ident);
else
ident := new name_record'(ptr, ident_name(1..ptr));
token := new token_record'(identifier, ident);
end if;
end get_identifier;
--
-- get_number -- This subroutine parses tokens which begin with a digit. This
-- means integer and floating point numbers, either of which may
-- be based (legal bases are 2-16).
--
-- Syntax: number ::= value | based_value
-- based_value ::= base '#' value '#'
-- base ::= integer
-- value ::= integer | float
-- integer ::= digit { ['_'] digit }
-- float ::= integer '.' integer
--
separate(token.get_token)
procedure get_number( token : out token_ptr ) is
base : integer := 10; -- default is base 10
digit : integer;
fp_decimal : float := 1.0; -- factor for digits after decimal point
fp_num : float;
int_num : integer := 0; -- initial value is 0
based, done, err_flg, fp : boolean := false;
max_int_div_10 : constant integer := (integer'last/10);
max_int_last_digit : constant integer := (integer'last - 10*max_int_div_10);
--
-- digit_val -- Converts a single character to a number in the current
-- base. No error checking; the character must have been
-- checked by is_a_digit.
--
function digit_val(char : in character; base : in integer) return integer is
char_val : integer;
begin
char_val := character'pos(char) - character'pos('0');
if char_val > 9 then -- letter A-F or a-f
if char >= 'a' then -- lower case
char_val := char_val - 39;
else -- upper case
char_val := char_val - 7;
end if;
end if;
return char_val;
end digit_val;
--
-- is_a_digit -- check a character to see if it is a valid digit in the
-- current base.
--
function is_a_digit(char : in character; base : in integer) return boolean is
char_pos : integer;
begin
char_pos := character'pos(char) - character'pos('0');
if char_pos < 0 then -- below digits
return false;
elsif char_pos < 10 then -- it's a digit
if char_pos < base then -- within the base
return true;
else
return false;
end if;
elsif char_pos < 17 then
return false;
elsif char_pos < (base + 7) then -- a digit A-F in the base
return true;
end if;
char_pos := char_pos - 32; -- check for lower case
if (char_pos < 17) or (char_pos >= (base+7)) then
return false;
else
return true;
end if;
end is_a_digit;
begin -- get_number
loop
get_char; -- get the next numeric char
if is_a_digit(current_char, base) then
digit := digit_val(current_char, base);
if fp then -- we're building a floating point number
fp_decimal := fp_decimal / float(base); -- adjust value of digit
fp_num := fp_num + float(digit) * fp_decimal;
else -- an integer (at least, so far)
if (int_num > max_int_div_10) or
((int_num = max_int_div_10) and (digit > max_int_last_digit)) then
error(pointer,"integer too large");
err_flg := true;
else
int_num := int_num * base + digit;
end if;
end if;
elsif current_char = '_' then -- ignore underline when separating digits
if not is_a_digit(look_ahead_char, base) then
error(pointer,"underline must separate digits");
err_flg := true;
end if;
elsif current_char = '#' then -- deal with based number
if based then -- already working on a based number so this is the end
done := true;
else -- if legal, current value becomes the new base
if fp or (int_num < 2) or (int_num > 16) then -- illegal
error(pointer,"illegal base");
err_flg := true;
else
base := int_num;
int_num := 0;
based := true;
if not is_a_digit(look_ahead_char, base) then
error(pointer,"base declaration must be followed by" &
" an appropriate based number");
err_flg := true;
end if;
end if;
end if;
elsif current_char = '.' then -- deal with floating point number
if fp then
error(pointer,"extra decimal point");
err_flg := true;
elsif not is_a_digit(look_ahead_char, base) then
error(pointer,"decimal point must be followed by digit");
err_flg := true;
else -- current value is to left of decimal point in fp_num
fp_num := float(int_num);
fp := true;
end if;
else -- we don't know what the heck we got . . .
if based then
error(pointer,"illegal character in based number");
else error(pointer,"illegal character in number");
end if;
err_flg := true;
end if;
if valid_ending(look_ahead_char) then -- at end of number
done := true;
end if;
--
-- special case: if a floating point number has been found and a
-- period is coming, it may be the end of the clause.
-- For example: a(X) :- X is 7.0.
-- There appears to be no reasonable way to allow this
-- with integers, however.
--
if fp and (look_ahead_char = '.') then
done := true;
end if;
exit when (err_flg or done);
end loop;
if err_flg then -- skip rest of number; return last valid value
skip_rest_of_token;
end if;
--
-- now define a new token record according to the type of number we've got
--
if fp then
token := new token_record'(float_num, fp_num);
else
token := new token_record'(integer_num, int_num);
end if;
end get_number;
--
-- get_string -- Parses string literals delimited by double quotes ('"').
-- A double quote may be embedded in a string by placing two
-- of them side by side (example: "abc""def" --> abc"def).
-- A null string (zero length) may be specified by not
-- enclosing any characters within the quotes (example:
-- "" --> null string). Strings may not overlap end-of-line.
-- Line length is controlled by package "io"
--
-- syntax: string ::= '"' text_of_string '"'
--
separate(token.get_token)
procedure get_string( token : out token_ptr) is
ptr : integer range 0..io.max_line_length := 0;
string_value : string(1..io.max_line_length) := (others => ' ');
ident : name_ptr;
begin
get_char;
loop
if (look_ahead_char = ascii.cr) or (look_ahead_char = ascii.eot) then
error(pointer,"no terminating '""' for string");
get_char;
exit;
elsif look_ahead_char = '"' then
get_char; -- throw away '"'
if look_ahead_char = '"' then -- embedded '"'
null;
else -- done with this string
exit;
end if;
end if;
get_char;
ptr := ptr + 1;
string_value(ptr) := current_char;
end loop;
if ptr = 0 then
error(pointer,"null string not allowed");
string_value(1) := '?';
ptr := 1;
end if;
ident := new name_record'(ptr, string_value(1..ptr));
token := new token_record'(identifier, ident);
end get_string;
-----------------------------
-- Fuzzy Prolog routines --
-----------------------------
--
-- get_fuzzy_backslash -- The backslash may be the start of either of the
-- non-equality operators in Fuzzy Prolog. These
-- are '\=' and '\=='
--
separate(token.get_token)
procedure get_fuzzy_backslash( token : out token_ptr) is
begin
if look_ahead_char /= '=' then
error(pointer,"must be '\=' or '\=='");
token := new token_record'(is_a => null_token);
skip_rest_of_token;
else
get_char;
if look_ahead_char = '=' then -- '\=='
token := new token_record'(is_a => not_equality);
get_char;
else -- '\='
token := new token_record'(is_a => not_equal);
end if;
end if;
end get_fuzzy_backslash;
--
-- get_fuzzy_colon -- The colon may only be the start of the implication
-- token ':-'
--
separate(token.get_token)
procedure get_fuzzy_colon( token : out token_ptr) is
begin
get_char;
if look_ahead_char /= '-' then
error(pointer,"must be ':-'");
token := new token_record'(is_a => null_token);
skip_rest_of_token;
else
token := new token_record'(is_a => implication);
get_char;
end if;
end get_fuzzy_colon;
--
-- get_fuzzy_equal -- The equal may be an equality test ('='), or may be
-- the start of equality ('=='), less than or equal to
-- ('=<'), or univ ('=..')
--
separate(token.get_token)
procedure get_fuzzy_equal( token : out token_ptr) is
begin
get_char;
if look_ahead_char = '=' then -- equality
token := new token_record'(is_a => equality);
get_char;
elsif look_ahead_char = '<' then -- less than or equal to
token := new token_record'(is_a => less_or_equal);
get_char;
elsif look_ahead_char = '.' then -- may be univ?
get_char;
if look_ahead_char /= '.' then -- oops
error(pointer,"univ must be '=..'");
skip_rest_of_token;
token := new token_record'(is_a => null_token);
else -- univ
token := new token_record'(is_a => univ);
get_char;
end if;
else -- just plain old '=' (equal)
token := new token_record(equal);
end if;
end get_fuzzy_equal;
--
-- get_fuzzy_underline -- The underline in Fuzzy Prolog represents an
-- anonymous variable. This routine merely ensures
-- that the underline is followed by a valid delimiter
--
separate(token.get_token)
procedure get_fuzzy_underline( token : out token_ptr) is
begin
if valid_ending(look_ahead_char) then
token := new token_record(is_a => underline);
else
error(pointer,"identifiers may not begin with an underline");
token := new token_record(is_a => null_token);
end if;
end get_fuzzy_underline;